{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TASK 3 (2014-2016) - Main Results\n",
    "\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import logging\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from joblib import dump, load\n",
    "\n",
    "import views\n",
    "from views import Ensemble, Model, Downsampling, Period\n",
    "from views.utils.data import assign_into_df\n",
    "from views.apps.transforms import lib as translib\n",
    "from views.apps.evaluation import lib as evallib, feature_importance as fi\n",
    "from views.apps.model import api\n",
    "from views.apps.extras import extras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Level tag; further down it only labels output files and the table header.\n",
    "level = \"cm\"\n",
    "\n",
    "# Look the dataset up by name in the ViEWS registry and keep its dataframe.\n",
    "dataset = views.DATASETS[\"cm_africa_imp_0\"]\n",
    "df = dataset.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output directories, one per kind of result, all below ./models/eval_real/.\n",
    "model_path = \"./models/eval_real/{sub}\"\n",
    "out_paths = {\n",
    "    sub: model_path.format(sub=sub)\n",
    "    for sub in (\"evaluation3\", \"features3\")\n",
    "}\n",
    "\n",
    "# Create whatever is missing; directories that already exist are left alone.\n",
    "for out_dir in out_paths.values():\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Wikipedia-derived covariates and attach them to the ViEWS data.\n",
    "# The CSV location can be overridden with the WIKI_DATA_PATH environment\n",
    "# variable; the default is the path this notebook was originally run with.\n",
    "wiki_path = os.environ.get(\n",
    "    \"WIKI_DATA_PATH\",\n",
    "    r'/home/default/Dokumente/TRINITY/comp/prediction-project/data/Wiki_Data_Final_2021.csv',\n",
    ")\n",
    "wiki = pd.read_csv(wiki_path)\n",
    "\n",
    "# Index the covariates by (month_id, country_id), the keys used for the merge.\n",
    "multi = wiki.set_index(['month_id', 'country_id'])\n",
    "multi = multi.sort_index(level=0)\n",
    "\n",
    "# Left join: every row of df is kept, unmatched rows get NaN in the new columns.\n",
    "# NOTE: df is overwritten, so re-running this cell without re-running the\n",
    "# dataset cell first merges the covariates a second time.\n",
    "df = df.merge(multi, on=['month_id', 'country_id'], how='left')\n",
    "\n",
    "# NOTE: this fills every NaN in the frame with 0, not only those introduced\n",
    "# by the merge.\n",
    "df = df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Partition the data into a calibration and a test period.\n",
    "# Dates follow from the 121 = 1990-01 anchor, i.e.\n",
    "# month_id = (year - 1980) * 12 + month.\n",
    "# Training and forecast windows are contiguous: the calibration forecast and\n",
    "# the test training both end at 408, directly before the test forecast starts\n",
    "# at 409. (They previously ended at 407, so month 408 was used nowhere.)\n",
    "period_calib = api.Period(\n",
    "    name=\"calib\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=372,      # 2010-12\n",
    "    predict_start=373,  # 2011-01\n",
    "    predict_end=408,    # 2013-12\n",
    ")\n",
    "\n",
    "\n",
    "period_test = api.Period(\n",
    "    name=\"test\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=408,      # 2013-12\n",
    "    predict_start=409,  # 2014-01\n",
    "    predict_end=444,    # 2016-12\n",
    ")\n",
    "\n",
    "\n",
    "periods = [period_calib, period_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps handed to the model below and looped over for feature importances:\n",
    "# 2 through 7 inclusive.\n",
    "steps = list(range(2, 8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset \"most important\" features from the ViEWS-dataset and Wikipedia variables.\n",
    "#\n",
    "# cols_features holds column NAMES, not a slice of df: it is passed on as the\n",
    "# cols_features argument of api.Model, and a name list avoids copying the data.\n",
    "cols_features = [\n",
    "    'ged_best_sb', 'wdi_vc_btl_deth', 'reign_precip', 'reign_prev_conflict',\n",
    "    'reign_tenure_months', 'reign_irregular', 'reign_lastelection', 'reign_loss',\n",
    "    'reign_pctile_risk', 'reign_couprisk', 'ged_count_sb', 'ged_best_os',\n",
    "    'ged_count_os', 'wdi_eg_use_pcap_kg_oe', 'ged_best_ns',\n",
    "    'vdem_v2x_accountability', 'wdi_nv_srv_totl_zs', 'wdi_sp_pop_totl',\n",
    "    'wdi_sl_tlf_totl_fe_zs', 'wdi_sm_pop_totl_zs', 'wdi_sm_pop_refg_or',\n",
    "    'wdi_dt_oda_odat_pc_zs', 'ged_count_ns', 'reign_gov_dominant_party',\n",
    "    'reign_age', 'vdem_v2xpe_exlpol', 'wdi_ag_lnd_frst_k2',\n",
    "    'wdi_bg_gsr_nfsv_gd_zs', 'wdi_st_int_rcpt_xp_zs', 'vdem_v2x_clpol',\n",
    "    'vdem_v2x_genpp', 'viewsEnglish',\n",
    "]\n",
    "\n",
    "# Fail here, with the offending names, if a feature is absent from df\n",
    "# (selecting the columns directly used to raise the same kind of error).\n",
    "missing_features = [col for col in cols_features if col not in df.columns]\n",
    "if missing_features:\n",
    "    raise KeyError(f\"Feature columns missing from df: {missing_features}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of estimators in the random forest; passed to RandomForestRegressor\n",
    "# as n_estimators when the model is defined.\n",
    "n_estimators = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the model. Nothing is fitted here; fitting is done by calling\n",
    "# fit_estimators on the model.\n",
    "\n",
    "task3_delta = api.Model(\n",
    "    name=\"task3_delta\",\n",
    "    col_outcome=\"ln_ged_best_sb\",\n",
    "    cols_features=cols_features,\n",
    "    steps=steps,\n",
    "    outcome_type=\"real\",\n",
    "    periods=periods,\n",
    "    estimator=RandomForestRegressor(\n",
    "        n_estimators=n_estimators,\n",
    "        # criterion is left at its default, which is mean squared error. Spelling\n",
    "        # it out ties the notebook to a scikit-learn version: \"mse\" is rejected\n",
    "        # by newer releases and \"squared_error\" by older ones.\n",
    "        random_state=42,  # fixed seed so repeated runs build the same forest\n",
    "        n_jobs=-1,        # use all available cores\n",
    "    ),\n",
    "    delta_outcome=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models to run; the fit, predict, evaluate and importance cells loop over this list.\n",
    "models = [task3_delta]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12min 27s, sys: 11.1 s, total: 12min 38s\n",
      "Wall time: 13min 13s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Train all models on df. The recorded output of this cell shows roughly\n",
    "# 13 minutes of wall time, so expect a long wait.\n",
    "for model in models:\n",
    "    model.fit_estimators(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each model, write its predictions into df, then its calibrated predictions.\n",
    "for model in models:\n",
    "    # Plain predictions go in first; the calibration call below is given the\n",
    "    # updated df.\n",
    "    df = assign_into_df(df, model.predict(df))\n",
    "\n",
    "    df_predictions = model.predict_calibrated(\n",
    "        df=df,\n",
    "        period_calib=period_calib,\n",
    "        period_test=period_test,\n",
    "    )\n",
    "    df = assign_into_df(df, df_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate all models against df. Scores are stored in the model object;\n",
    "# the scores table is later built from model.scores.\n",
    "for model in models:\n",
    "    model.evaluate(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the per-step score table (and its LaTeX form) for the chosen partition.\n",
    "partition = \"test\"\n",
    "\n",
    "for model in models:\n",
    "    for calib in [\"uncalibrated\"]:\n",
    "        # Keep the per-step entries only; the \"sc\" entry is skipped.\n",
    "        per_step = {\n",
    "            key: value\n",
    "            for key, value in model.scores[partition].items()\n",
    "            if key != \"sc\"\n",
    "        }\n",
    "\n",
    "        scores = {\n",
    "            \"Step\": list(per_step),\n",
    "            \"MSE\": [value[calib][\"mse\"] for value in per_step.values()],\n",
    "        }\n",
    "        # TADDA is only added for delta-outcome models.\n",
    "        if model.delta_outcome:\n",
    "            scores[\"TADDA\"] = [\n",
    "                value[calib][\"tadda_score\"] for value in per_step.values()\n",
    "            ]\n",
    "\n",
    "        out = pd.DataFrame(scores)\n",
    "        tex = out.to_latex(index=False)\n",
    "\n",
    "        # Prepend a LaTeX comment header describing how the table was made.\n",
    "        now = datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")\n",
    "        meta = f\"\"\"\n",
    "        %Output created by wb_models.ipynb.\n",
    "        %Evaluation of {model.col_outcome} per step.\n",
    "        %Run on selected {model.name} features at {level} level.\n",
    "        %Produced on {now}, written to {out_paths[\"evaluation3\"]}.\n",
    "        \\\\\n",
    "        \"\"\"\n",
    "        tex = meta + tex\n",
    "\n",
    "        scores_file = f\"{model.name}_{level}_{calib}_scores.tex\"\n",
    "        path_out = os.path.join(out_paths[\"evaluation3\"], scores_file)\n",
    "\n",
    "        # Writing the table to disk is switched off; tex and path_out are only\n",
    "        # prepared. To enable it again:\n",
    "        #   with open(path_out, \"w\") as f:\n",
    "        #       f.write(tex)\n",
    "        #       print(f\"Wrote scores table to {path_out}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Step       MSE     TADDA\n",
      "0     2  0.904365  0.590441\n",
      "1     3  0.917013  0.580471\n",
      "2     4  0.836393  0.552172\n",
      "3     5  0.912593  0.581012\n",
      "4     6  1.111199  0.622698\n",
      "5     7  1.013837  0.591159\n"
     ]
    }
   ],
   "source": [
    "# Show the score table built above. A bare last expression gives the\n",
    "# notebook's rich table rendering instead of print()'s plain text.\n",
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Permutation Feature Importance\n",
    "\n",
    "sort_step = 3  # features are ranked by their importance at this step\n",
    "top = 35       # keep at most this many features\n",
    "\n",
    "for model in models:\n",
    "    # One frame per step, indexed by feature, its importance column named \"s=<step>\".\n",
    "    step_frames = []\n",
    "    for step in steps:\n",
    "        pi_dict = model.extras.permutation_importances[\"test\"][step][\"test\"]\n",
    "        step_df = (\n",
    "            pd.DataFrame(fi.reorder_fi_dict(pi_dict))\n",
    "            .rename(columns={\"importance\": f\"s={step}\"})\n",
    "            .set_index(\"feature\")\n",
    "        )\n",
    "        step_frames.append(step_df)\n",
    "\n",
    "    # Left-join the later steps onto the first one. Using the list position,\n",
    "    # rather than comparing step numbers, keeps this correct when steps is\n",
    "    # not sorted in ascending order.\n",
    "    pi_df = step_frames[0]\n",
    "    for step_df in step_frames[1:]:\n",
    "        pi_df = pi_df.join(step_df)\n",
    "\n",
    "    pi_df = pi_df.sort_values(by=[f\"s={sort_step}\"], ascending=False)\n",
    "    # head(top) keeps exactly `top` rows; the former slice [0:top + 1] kept one more.\n",
    "    pi_df = pi_df.head(top)\n",
    "\n",
    "    # NOTE(review): the file name says \"impurity_imp\" although the table holds\n",
    "    # permutation importances; the name is kept so existing references still work.\n",
    "    fi.write_fi_tex(\n",
    "        pi_df,\n",
    "        os.path.join(out_paths[\"features3\"], f\"impurity_imp_{model.name}_{level}.tex\")\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                               s=2       s=3       s=4       s=5       s=6  \\\n",
      "feature                                                                      \n",
      "wdi_vc_btl_deth           0.547520  0.447588  0.496458  0.461464  0.477414   \n",
      "reign_prev_conflict       0.253293  0.273838  0.311067  0.263445  0.261744   \n",
      "wdi_sm_pop_refg_or        0.153911  0.234700  0.266033  0.196788  0.194692   \n",
      "ged_count_sb              0.122605  0.156398  0.126836 -0.010598  0.093980   \n",
      "wdi_eg_use_pcap_kg_oe     0.139877  0.130832  0.141962  0.138358  0.147654   \n",
      "wdi_bg_gsr_nfsv_gd_zs     0.073841  0.093193  0.154164  0.210949  0.089303   \n",
      "wdi_sm_pop_totl_zs        0.063179  0.086828  0.090542  0.101334  0.092308   \n",
      "reign_loss                0.016360  0.045726  0.057545  0.066384  0.060318   \n",
      "wdi_dt_oda_odat_pc_zs     0.043317  0.037324  0.035327  0.048144  0.061515   \n",
      "wdi_st_int_rcpt_xp_zs     0.023553  0.036446  0.028750  0.029186  0.032311   \n",
      "wdi_nv_srv_totl_zs        0.013595  0.025834  0.021870  0.031816  0.016026   \n",
      "wdi_sp_pop_totl           0.030760  0.021002  0.031692  0.034555  0.033859   \n",
      "ged_count_os              0.013588  0.018597  0.008079  0.021789  0.008959   \n",
      "wdi_sl_tlf_totl_fe_zs     0.011590  0.013498  0.014084  0.019777  0.021986   \n",
      "vdem_v2x_accountability   0.003015  0.009964  0.007013  0.010464  0.007386   \n",
      "vdem_v2x_genpp            0.006356  0.009061  0.008924  0.005238  0.025054   \n",
      "ged_best_os               0.041936  0.008504  0.023060  0.020269  0.008100   \n",
      "reign_lastelection        0.011517  0.005069  0.007855  0.002959  0.007607   \n",
      "ged_best_ns               0.007588  0.003285 -0.001625  0.009435  0.022682   \n",
      "reign_precip              0.006784  0.002722  0.004758  0.010679  0.007611   \n",
      "wdi_ag_lnd_frst_k2        0.003863  0.001746  0.011863  0.011056  0.009980   \n",
      "vdem_v2xpe_exlpol         0.009096  0.001633  0.010324  0.022482  0.030879   \n",
      "reign_pctile_risk         0.003369  0.001468  0.002290  0.002546  0.003708   \n",
      "reign_couprisk           -0.001312  0.000801  0.002053 -0.001751 -0.001154   \n",
      "ged_count_ns              0.010608  0.000668 -0.000513  0.004471  0.003258   \n",
      "reign_gov_dominant_party  0.000239  0.000621  0.000199  0.000355  0.001500   \n",
      "vdem_v2x_clpol            0.004874  0.000308  0.005530  0.013476  0.004354   \n",
      "views                    -0.001676 -0.000660  0.000894 -0.003699 -0.003755   \n",
      "reign_irregular           0.026440 -0.003871  0.013864  0.014098  0.007387   \n",
      "reign_age                 0.005388 -0.004296  0.008859  0.007072 -0.014136   \n",
      "reign_tenure_months      -0.012270 -0.013588  0.000121 -0.003954  0.012926   \n",
      "ged_best_sb              -0.606335 -0.596400 -0.504180 -0.426055 -0.441024   \n",
      "\n",
      "                               s=7  \n",
      "feature                             \n",
      "wdi_vc_btl_deth           0.466424  \n",
      "reign_prev_conflict       0.265068  \n",
      "wdi_sm_pop_refg_or        0.164663  \n",
      "ged_count_sb              0.069576  \n",
      "wdi_eg_use_pcap_kg_oe     0.120114  \n",
      "wdi_bg_gsr_nfsv_gd_zs     0.111168  \n",
      "wdi_sm_pop_totl_zs        0.098138  \n",
      "reign_loss                0.035860  \n",
      "wdi_dt_oda_odat_pc_zs     0.029310  \n",
      "wdi_st_int_rcpt_xp_zs     0.015177  \n",
      "wdi_nv_srv_totl_zs        0.021005  \n",
      "wdi_sp_pop_totl           0.056720  \n",
      "ged_count_os              0.003662  \n",
      "wdi_sl_tlf_totl_fe_zs     0.008056  \n",
      "vdem_v2x_accountability   0.008242  \n",
      "vdem_v2x_genpp            0.010876  \n",
      "ged_best_os               0.008123  \n",
      "reign_lastelection        0.003221  \n",
      "ged_best_ns               0.007139  \n",
      "reign_precip              0.006024  \n",
      "wdi_ag_lnd_frst_k2        0.003888  \n",
      "vdem_v2xpe_exlpol         0.007057  \n",
      "reign_pctile_risk        -0.002164  \n",
      "reign_couprisk           -0.005781  \n",
      "ged_count_ns              0.002603  \n",
      "reign_gov_dominant_party  0.002859  \n",
      "vdem_v2x_clpol           -0.007982  \n",
      "views                    -0.000618  \n",
      "reign_irregular           0.010714  \n",
      "reign_age                 0.007605  \n",
      "reign_tenure_months       0.012743  \n",
      "ged_best_sb              -0.492083  \n"
     ]
    }
   ],
   "source": [
    "# Show the importance table. A bare last expression renders it as one table,\n",
    "# whereas print() wraps the columns across several text blocks.\n",
    "pi_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
